#!/usr/bin/env bash
set -uo pipefail

#==============================================================#
# File      :   pg-vacuum
# Ctime     :   2018-05-18
# Mtime     :   2020-12-17
# Desc      :   launch manual vacuum tasks
# Path      :   /pg/bin/pg-vacuum
# Depend    :   psql, monitor views
# Author    :   Vonng(fengruohang@outlook.com)
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#

PROG_NAME="$(basename $0))"
PROG_DIR="$(cd $(dirname $0) && pwd)"

# psql & pg_repack PATH
export PATH=/usr/pgsql/bin:${PATH}

#==============================================================#
#                             Utils                            #
#==============================================================#
# logger functions
function log_debug() {
    [ -t 2 ] && printf "\033[0;34m[$(date "+%Y-%m-%d %H:%M:%S")][DEBUG] $*\033[0m\n" >&2 ||\
     printf "[$(date "+%Y-%m-%d %H:%M:%S")][DEBUG] $*\n" >&2
}
function log_info() {
    [ -t 2 ] && printf "\033[0;32m[$(date "+%Y-%m-%d %H:%M:%S")][INFO] $*\033[0m\n" >&2 ||\
     printf "[$(date "+%Y-%m-%d %H:%M:%S")][INFO] $*\n" >&2
}
function log_warn() {
    [ -t 2 ] && printf "\033[0;33m[$(date "+%Y-%m-%d %H:%M:%S")][WARN] $*\033[0m\n" >&2 ||\
     printf "[$(date "+%Y-%m-%d %H:%M:%S")][INFO] $*\n" >&2
}
function log_error() {
    [ -t 2 ] && printf "\033[0;31m[$(date "+%Y-%m-%d %H:%M:%S")][ERROR] $*\033[0m\n" >&2 ||\
     printf "[$(date "+%Y-%m-%d %H:%M:%S")][INFO] $*\n" >&2
}

# get primary IP address
function local_ip(){
    # ip range in 10.xxx.xxx.xx
    echo $(/sbin/ifconfig | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | grep -Eo '10\.([0-9]*\.){2}[0-9]*')
}


# send mail via mail service
function send_mail(){
    local subject=$1
    local content=$2
    local to=${3-"fengruohang@p1.com"}
    local mail_service="http://10.191.167.134:28888/v1/sendmail/"

    curl -s -d "subject=${subject}&content=${content}&to=${to}" ${mail_service} > /dev/null
}

# slave returns 't', psql access required
function is_slave(){
    local db=$1
    echo $(psql ${db} -AXtqwc "SELECT pg_is_in_recovery();")
}

# kill vacuum queries to avoid contention, psql access required
function kill_queries(){
    local db=$1
    kill_count=$(psql ${db} -qAt <<-'EOF'
    SELECT count(pg_cancel_backend(pid))
    FROM pg_stat_activity
    WHERE state <> 'idle' AND pid <> pg_backend_pid()
          AND (query ~* 'vacuum' or query ~* 'analyze');
EOF
2>/dev/null)
    echo ${kill_count}
}

# return tables age by fullname
function table_age(){
    local db=$1
    local fullname=$2
    echo $(psql ${db} -AXtqwc "SELECT age(relfrozenxid) FROM pg_class WHERE oid = '${fullname}'::RegClass::oid;")
}

# get database's age
function database_age(){
    local db=$1
    echo $(psql ${db} -AXtqwc "SELECT age(datfrozenxid) FROM pg_database WHERE datname = '${db}';")
}


# return aging page / total page
function aging_ratio(){
    local db=$1
    aging_ratio=$(psql ${db} -AXtqw <<-'EOF'
    WITH aging_table AS (SELECT
      c.oid                                              as relid,
      c.relname                                          as relname,
      nsp.nspname,
      nsp.nspname || '.' || c.relname                    as fullname,
      greatest(age(c.relfrozenxid), age(t.relfrozenxid)) as age,
      c.relpages as relpages
    FROM pg_class c
      LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
      LEFT JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    WHERE c.relkind IN ('r', 'm') AND nsp.nspname NOT IN ('pg_catalog', 'information_schema')),
      aging_page AS (SELECT sum(relpages)::NUMERIC as aging_page FROM aging_table WHERE age > 50000000),
      total_page AS (SELECT sum(relpages)::NUMERIC as total_page FROM aging_table)
    SELECT (100.0 * (aging_page / total_page))::INTEGER as scan_ratio FROM aging_page CROSS JOIN total_page;
EOF
2>/dev/null)
    echo ${aging_ratio}
}

#==============================================================#
#                        Freeze Table                          #
#==============================================================#

#--------------------------------------------------------------#
# Name: freeze_tables
# Desc: repack table via fullname
# Arg1: database_name
# Argv: list of table full name
# Deps: psql
#--------------------------------------------------------------#
# repack single table
function freeze_tables(){
    local db=$1
    shift

    log_info "freeze ${db} tables begin"
    log_info "freeze table list: $@"

    for relname in $@
    do
        # check before age and make sure table exist
        local -i before_age=$(table_age ${db} ${relname})
        if(( $?!=0 )); then
            log_error "table ${relname} not found"
            return 1
        fi

        log_info "freeze table ${db} ${relname} begin. before age: ${before_age}"
        psql ${db} -AXtqw <<-EOF
        SET vacuum_cost_limit=10000;
        SET vacuum_cost_delay='5ms';
        VACUUM FREEZE VERBOSE ${relname};
        ANALYZE VERBOSE ${relname};
EOF
        local -i after_age=$(table_age ${db} ${relname})

        if (( ${after_age} > ${before_age} )) && (( ${after_age} > 100000000 )); then
            log_warn "Warnning! table ${relname} age is still greater than 1,0000,0000"
            return 1
        fi
       log_info "vacuum table ${db} ${relname} done. age: ${before_age} -> ${after_age}"
    done

    log_info "repack ${db} tables done"
}




#--------------------------------------------------------------#
# Name: get_aging_tables
# Desc: find old tables age > 81921024
# Arg1: database_name
# Echo: list of full table name
# Deps: psql, monitor.pg_aging_tables
#--------------------------------------------------------------#
function get_aging_tables(){
    echo $(psql ${1} -AXtqw <<-'EOF'
WITH aging_tables AS (SELECT
  nsp.nspname || '.' || c.relname                    as fullname,
  greatest(age(c.relfrozenxid), age(t.relfrozenxid)) as age,
  round(c.relpages :: NUMERIC / 128, 3)              as size_mb
FROM pg_class c
  LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
  LEFT JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
WHERE c.relkind IN ('r', 'm') AND nsp.nspname NOT IN ('pg_catalog', 'information_schema')
)
SELECT fullname FROM aging_tables WHERE age > 81921024 ORDER BY size_mb ASC;
EOF
)
}

#--------------------------------------------------------------#
# Name: freeze_database
# Desc: find old tables age > 81921024
# Arg1: database_name
# Echo: list of full table name
# Deps: psql, monitor.pg_aging_tables
#--------------------------------------------------------------#
function freeze_database(){
    local db=$1

    if [[ $(is_slave ${db}) == "t" ]]; then
        log_error "slave can't perform repack. do it on master"
        exit
    fi

    local -i aging_threashold=40
    local -i aging_ratio=$(aging_ratio ${db})
    log_info "freeze database ${db} begin, aging_ratio: ${aging_ratio}"

    # freeze
    local -i before_age=$(database_age ${db})
    if(( ${before_age}<81921024 )); then
        log_info "database ${db} current age is ${before_age}, still young, skip"
        return
    fi

    # if thresh% page need be scaned, use full database freeze;
    if (( ${aging_ratio}>${aging_threashold} ))
    then
        log_info "freeze full database ${db} begin. before age: ${before_age}"
        psql ${db} -AXtqw <<-EOF
        SET vacuum_cost_limit=10000;
        SET vacuum_cost_delay='5ms';
        VACUUM FREEZE VERBOSE;
        ANALYZE VERBOSE;
EOF
    else
        log_info "freeze database ${db} aging tables begin. before age: ${before_age}"
        freeze_tables ${db} $(get_aging_tables ${db})
    fi
    local -i after_age=$(database_age ${db})

    # still aged, send mail
    if (( ${after_age} > ${before_age} )) && (( ${after_age} > 100000000 )); then
        log_warn "Warnning! ${db} age is still greater than 1,0000,0000"
        send_mail "PG VACUUM FAILED on ${db}@$(local_ip)" "${before_age} to ${after_age}" "fengruohang@p1.com"
    fi
   log_info "vacuum database ${db} done. age: ${before_age} -> ${after_age}"
}

#--------------------------------------------------------------#
# Name: freeze
# Desc: freeze all database in local cluster
#--------------------------------------------------------------#
function vacuum(){
    local lock_path="/tmp/vacuum.lock"
    log_info "rountine vacuum begin, lock @ ${lock_path}"

    if [ -e ${lock_path} ] && kill -0 $(cat ${lock_path}); then
        log_error "vacuum already running: $(cat ${lock_path})"
        exit
    fi
    trap "rm -f ${lock_path}; exit" INT TERM EXIT
    echo $$ > ${lock_path}


    local databases=$(psql -AXtqwc "SELECT datname FROM pg_database WHERE datname NOT IN ('template0','template1','postgres');")
    log_info "freeze all database in local cluster: ${databases}"
    for database in ${databases}
    do
        freeze_database ${database}
    done
    log_info "freeze all database in local cluster done"

    # remove lock
    rm -f ${lock_path}
    log_info "routine vacuum done"
}


vacuum